Overview

Objective

This is an exploratory analysis on three classes of students scores in Indonesian language subject. These classes were under the tutelage of A. N. in MAN 3 Lebak and I hope I can discover something of value from this analysis.

Data

The data consists of prepared tables that I had processed in excel, they are:

  1. harian table, that presents 4 sets of minor tests result that each consists of proyek, portofolio, and praktek;

  2. phbo pat table that presents 2 sets of major (half and last semester) tests result.

Preparation

Library

library(readxl)
library(tidyverse)
library(tidymodels)
library(encryptr)
library(janitor)
library(skimr)
library(plotly)
library(glue)
library(GGally)
library(tidytext)
library(FactoMineR)
library(ggiraphExtra)
library(factoextra)

Import

harian <- read_excel("edit_bi_kelas_noni.xlsx", 
    sheet = "harian")
semesteran <- read_excel("edit_bi_kelas_noni.xlsx", 
    sheet = "phbo pat")

Clean the column names.

harian <- clean_names(harian)
semesteran <- clean_names(semesteran)

Encryption

Encrypt the name of students to protect privacy.

Generate Keys

#genkeys()

Encrypt Names

Create codes for names.

names_encryption <- 
  harian %>% 
  select(nama_siswa) %>%
  unique() %>% 
  mutate(
    name_code = nama_siswa
  ) %>% 
  encrypt(name_code)

Replace real names in nama_siswa with the codes.

harian_encrypted <- 
  harian %>% 
  left_join(names_encryption, 
            by = c("nama_siswa" = "nama_siswa"),
            keep = F) %>% 
  select(-nama_siswa) %>% 
  select(name_code, everything())


harian_encrypted %>% head()
## # A tibble: 6 x 8
##   name_code                  kelas penil~1 materi nilai~2 proyek praktek porto~3
##   <chr>                      <chr> <chr>   <chr>    <dbl>  <dbl>   <dbl>   <dbl>
## 1 2d10f59d016483c1cce2a1ff0~ XI I~ Harian~ Propo~      78     78      78      78
## 2 a7bb645682646965a7b3998ea~ XI I~ Harian~ Propo~      78     78      78      78
## 3 13d13d8ba02bea54041a02584~ XI I~ Harian~ Propo~      78     78      78      78
## 4 a983d069d7b7ddab8cddc8fa8~ XI I~ Harian~ Propo~      78     78      78      78
## 5 3ec8b7095e42dddd7c1bedeb2~ XI I~ Harian~ Propo~      81     80      83      80
## 6 7d8a0ad9554e63d9cb507d5d5~ XI I~ Harian~ Propo~      78     78      78      78
## # ... with abbreviated variable names 1: penilaian, 2: nilai_ph, 3: portofolio

Compare total unique nama_siswa and unique name_code to check.

n_distinct(harian$nama_siswa)
## [1] 82
n_distinct(harian_encrypted$name_code)
## [1] 82

Encrypt semesteran table as well.

semesteran_encrypted <- 
  semesteran %>% 
  left_join(names_encryption, 
            by = c("nama_siswa" = "nama_siswa"),
            keep = F) %>% 
  select(-nama_siswa) %>% 
  select(name_code, everything())


semesteran_encrypted %>% head()
## # A tibble: 6 x 4
##   name_code                                                  kelas penil~1 nilai
##   <chr>                                                      <chr> <chr>   <dbl>
## 1 2d10f59d016483c1cce2a1ff066a05fc8164028f9df861ffbd105fc1a~ XI I~ PHBO       78
## 2 a7bb645682646965a7b3998ea7e088f0b92d785f5ce846543bd924b8a~ XI I~ PHBO       78
## 3 13d13d8ba02bea54041a02584536224960dce9f9799ad5fe6ded7602d~ XI I~ PHBO       78
## 4 a983d069d7b7ddab8cddc8fa8be7aa2d6ed1ea3be890a68e187e025f5~ XI I~ PHBO       78
## 5 3ec8b7095e42dddd7c1bedeb26ca5ebd32b1ba608295ea21174b4fcf3~ XI I~ PHBO       98
## 6 7d8a0ad9554e63d9cb507d5d5a8c4b9fabf8aad4f8589160141dbf317~ XI I~ PHBO       78
## # ... with abbreviated variable name 1: penilaian
n_distinct(semesteran$nama_siswa)
## [1] 82
n_distinct(semesteran_encrypted$name_code)
## [1] 82

I had processed it in excel, but let’s check missing data again.

anyNA(harian_encrypted)
## [1] FALSE
anyNA(semesteran_encrypted)
## [1] FALSE

Shuffle

Shuffle the row so the real names can’t be identified using the order of the original rows.

harian_encrypted <- harian_encrypted %>% slice_sample(prop = 1)
semesteran_encrypted <- semesteran_encrypted %>% slice_sample(prop = 1)

Analysis

Exploratory

glimpse(harian_encrypted)
## Rows: 328
## Columns: 8
## $ name_code  <chr> "664903989b60c806285a1419335b6687beee051dc0b33c3e6d55b71722~
## $ kelas      <chr> "XI IPS 1", "XI MIPA", "XI IPS 2", "XI MIPA", "XI MIPA", "X~
## $ penilaian  <chr> "Harian 4", "Harian 2", "Harian 2", "Harian 2", "Harian 2",~
## $ materi     <chr> "Drama", "Karya Ilmiah", "Karya Ilmiah", "Karya Ilmiah", "K~
## $ nilai_ph   <dbl> 85.00000, 86.00000, 78.00000, 86.00000, 78.00000, 85.00000,~
## $ proyek     <dbl> 85, 90, 78, 90, 78, 85, 78, 78, 85, 78, 78, 78, 78, 78, 78,~
## $ praktek    <dbl> 85, 78, 78, 78, 78, 85, 78, 78, 85, 78, 78, 78, 78, 78, 78,~
## $ portofolio <dbl> 85, 90, 78, 90, 78, 85, 78, 78, 85, 78, 78, 78, 78, 78, 78,~
glimpse(semesteran_encrypted)
## Rows: 164
## Columns: 4
## $ name_code <chr> "13d13d8ba02bea54041a02584536224960dce9f9799ad5fe6ded7602df6~
## $ kelas     <chr> "XI IPS 2", "XI MIPA", "XI IPS 1", "XI IPS 1", "XI IPS 1", "~
## $ penilaian <chr> "PAT", "PHBO", "PAT", "PHBO", "PHBO", "PAT", "PAT", "PHBO", ~
## $ nilai     <dbl> 85, 78, 85, 88, 78, 86, 84, 78, 85, 83, 86, 90, 84, 90, 83, ~

We can join the tables with name_code.

data <-
  harian_encrypted %>% left_join(semesteran_encrypted[,c(1, 3, 4)], 
                                 by = "name_code",
                                 suffix = c("_h", "_s"))
glimpse(data)
## Rows: 656
## Columns: 10
## $ name_code   <chr> "664903989b60c806285a1419335b6687beee051dc0b33c3e6d55b7172~
## $ kelas       <chr> "XI IPS 1", "XI IPS 1", "XI MIPA", "XI MIPA", "XI IPS 2", ~
## $ penilaian_h <chr> "Harian 4", "Harian 4", "Harian 2", "Harian 2", "Harian 2"~
## $ materi      <chr> "Drama", "Drama", "Karya Ilmiah", "Karya Ilmiah", "Karya I~
## $ nilai_ph    <dbl> 85, 85, 86, 86, 78, 78, 86, 86, 78, 78, 85, 85, 78, 78, 78~
## $ proyek      <dbl> 85, 85, 90, 90, 78, 78, 90, 90, 78, 78, 85, 85, 78, 78, 78~
## $ praktek     <dbl> 85, 85, 78, 78, 78, 78, 78, 78, 78, 78, 85, 85, 78, 78, 78~
## $ portofolio  <dbl> 85, 85, 90, 90, 78, 78, 90, 90, 78, 78, 85, 85, 78, 78, 78~
## $ penilaian_s <chr> "PAT", "PHBO", "PHBO", "PAT", "PHBO", "PAT", "PAT", "PHBO"~
## $ nilai       <dbl> 87, 95, 89, 84, 78, 78, 85, 91, 83, 95, 86, 85, 90, 86, 94~

Convert all the chr columns except name_code, to factor.

data <- data %>% 
  mutate(across(.cols = c(where(is.character), -name_code),.fns = as.factor))
glimpse(data)
## Rows: 656
## Columns: 10
## $ name_code   <chr> "664903989b60c806285a1419335b6687beee051dc0b33c3e6d55b7172~
## $ kelas       <fct> XI IPS 1, XI IPS 1, XI MIPA, XI MIPA, XI IPS 2, XI IPS 2, ~
## $ penilaian_h <fct> Harian 4, Harian 4, Harian 2, Harian 2, Harian 2, Harian 2~
## $ materi      <fct> Drama, Drama, Karya Ilmiah, Karya Ilmiah, Karya Ilmiah, Ka~
## $ nilai_ph    <dbl> 85, 85, 86, 86, 78, 78, 86, 86, 78, 78, 85, 85, 78, 78, 78~
## $ proyek      <dbl> 85, 85, 90, 90, 78, 78, 90, 90, 78, 78, 85, 85, 78, 78, 78~
## $ praktek     <dbl> 85, 85, 78, 78, 78, 78, 78, 78, 78, 78, 85, 85, 78, 78, 78~
## $ portofolio  <dbl> 85, 85, 90, 90, 78, 78, 90, 90, 78, 78, 85, 85, 78, 78, 78~
## $ penilaian_s <fct> PAT, PHBO, PHBO, PAT, PHBO, PAT, PAT, PHBO, PAT, PHBO, PAT~
## $ nilai       <dbl> 87, 95, 89, 84, 78, 78, 85, 91, 83, 95, 86, 85, 90, 86, 94~
skim(data = data)
Data summary
Name data
Number of rows 656
Number of columns 10
_______________________
Column type frequency:
character 1
factor 4
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
name_code 0 1 512 512 0 82 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
kelas 0 1 FALSE 3 XI : 256, XI : 208, XI : 192
penilaian_h 0 1 FALSE 4 Har: 164, Har: 164, Har: 164, Har: 164
materi 0 1 FALSE 4 Dra: 164, Kar: 164, Pro: 164, Res: 164
penilaian_s 0 1 FALSE 2 PAT: 328, PHB: 328

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
nilai_ph 0 1 79.92 3.15 78 78 78 81.67 92.67 ▇▁▂▁▁
proyek 0 1 80.19 4.01 78 78 78 80.00 100.00 ▇▂▁▁▁
praktek 0 1 79.70 3.03 78 78 78 80.00 90.00 ▇▁▂▁▁
portofolio 0 1 79.88 3.83 78 78 78 80.00 100.00 ▇▁▁▁▁
nilai 0 1 84.57 5.33 78 78 85 88.00 98.00 ▇▇▅▂▁
summary(data)
##   name_code              kelas       penilaian_h                materi   
##  Length:656         XI IPS 1:208   Harian 1:164   Drama            :164  
##  Class :character   XI IPS 2:192   Harian 2:164   Karya Ilmiah     :164  
##  Mode  :character   XI MIPA :256   Harian 3:164   Proposal Kegiatan:164  
##                                    Harian 4:164   Resensi          :164  
##                                                                          
##                                                                          
##     nilai_ph         proyek          praktek       portofolio     penilaian_s
##  Min.   :78.00   Min.   : 78.00   Min.   :78.0   Min.   : 78.00   PAT :328   
##  1st Qu.:78.00   1st Qu.: 78.00   1st Qu.:78.0   1st Qu.: 78.00   PHBO:328   
##  Median :78.00   Median : 78.00   Median :78.0   Median : 78.00              
##  Mean   :79.92   Mean   : 80.19   Mean   :79.7   Mean   : 79.88              
##  3rd Qu.:81.67   3rd Qu.: 80.00   3rd Qu.:80.0   3rd Qu.: 80.00              
##  Max.   :92.67   Max.   :100.00   Max.   :90.0   Max.   :100.00              
##      nilai      
##  Min.   :78.00  
##  1st Qu.:78.00  
##  Median :85.00  
##  Mean   :84.57  
##  3rd Qu.:88.00  
##  Max.   :98.00

Nilai Rata-rata Harian per Kelas

data <- data %>% 
  pivot_longer(cols = 6:8,
               names_to = "ppp",
               values_to = "nilai_ppp") 
data_mean_ph <-
  data %>% 
  group_by(kelas, penilaian_h, materi, ppp) %>% 
  summarise(rata2_ph = round(mean(nilai_ph),digits = 2)) %>% 
  mutate(avg = glue("Nilai Rata-rata {rata2_ph}"))
acols <- c("Harian 1" = "#C6E0FF", 
           "Harian 2" = "#579A9E",
           "Harian 3" = "#3292C3",
           "Harian 4" = "#BCAB79")

rata.plot <- ggplot(data_mean_ph, aes(y = kelas, x = rata2_ph, fill = penilaian_h))+
  geom_col(aes(text = avg))+
  facet_wrap(~penilaian_h)+
  theme_minimal()+
  theme(legend.position = "none",
        axis.title.y = element_blank())+
  labs(title = "Perbandingan Rata-rata Nilai Harian Pelajaran Bahasa",
       subtitle = "2020/2021 Semester Genap",
       x = "Nilai")+
  scale_fill_manual(
    values = acols      
  )+
  xlim(0, 100)

#rata.plot
ggplotly(rata.plot, tooltip = "text")

Nilai Ranah Keterampilan

data <- data %>% 
  mutate( # repair ppp content
    ppp = str_replace(ppp, "p", "P" ) 
  ) %>% 
  mutate( # limit digits
    nilai_ph = round(nilai_ph, digits = 2)
  ) %>% 
    mutate( # creating tooltip
    exp = glue(
      "{kelas} 
      {penilaian_h}: {nilai_ph}
      {ppp}: {nilai_ppp}"
    )
  )
fcols <- c("XI IPS 1" = "#BEEF9E", 
           "XI IPS 2" = "#3292C3",
           "XI MIPA" = "#BCAB79")

ppp.plot <- ggplot(data, aes(y = materi, 
                 x = nilai_ppp, 
                 fill = kelas))+
  geom_col(position = position_dodge(width = 0.95),
           aes(text = exp))+
  facet_wrap(~ppp)+
  theme_minimal()+
  theme(legend.position = "none",
        axis.title.y = element_blank())+
  labs(title = "Perbandingan Rincian Nilai Tugas Pelajaran Bahasa",
       subtitle = "2020/2021 Semester Genap",
       x = "Nilai")+
  scale_fill_manual(
    values = fcols      
  )

#ppp.plot
ggplotly(ppp.plot, tooltip = "text")

Nilai harian x ppp

hxp.plot <- 
  ggplot(data, 
         aes(x = nilai_ppp, 
             y = nilai_ph))+
  geom_jitter(aes(col = kelas,
                  text = exp),
              width = 0.3, 
              height = 0.3, 
              alpha = 0.7)+
  theme_minimal()+
  labs(title = "Hubungan Nilai PPP dan Penilaian Harian",
       x = "Nilai PPP",
       y = "Penilaian Harian")+
    scale_color_manual(
    values = fcols      
  )+
  theme(legend.position = "none",
        axis.title.y = element_blank())

  

#hxp.plot
ggplotly(hxp.plot, tooltip = "text")

Korelasi Nilai Penilaian Ranah Keterampilan dan Penilaian Harian

corp3ph.kelas.materi <- data %>% 
  group_by(kelas, materi) %>% 
  summarize(correlation = cor(nilai_ppp, nilai_ph))
corp3ph.kelas.materi
## # A tibble: 12 x 3
## # Groups:   kelas [3]
##    kelas    materi            correlation
##    <fct>    <fct>                   <dbl>
##  1 XI IPS 1 Drama                   1    
##  2 XI IPS 1 Karya Ilmiah            0.806
##  3 XI IPS 1 Proposal Kegiatan       0.813
##  4 XI IPS 1 Resensi                 0.918
##  5 XI IPS 2 Drama                   1    
##  6 XI IPS 2 Karya Ilmiah            0.805
##  7 XI IPS 2 Proposal Kegiatan       0.881
##  8 XI IPS 2 Resensi                 0.817
##  9 XI MIPA  Drama                   1    
## 10 XI MIPA  Karya Ilmiah            0.771
## 11 XI MIPA  Proposal Kegiatan       0.819
## 12 XI MIPA  Resensi                 0.882
corp3ph.kelas.materi %>% slice_max(correlation)
## # A tibble: 3 x 3
## # Groups:   kelas [3]
##   kelas    materi correlation
##   <fct>    <fct>        <dbl>
## 1 XI IPS 1 Drama            1
## 2 XI IPS 2 Drama            1
## 3 XI MIPA  Drama            1
corp3ph.kelas.materi %>% slice_min(correlation)
## # A tibble: 3 x 3
## # Groups:   kelas [3]
##   kelas    materi       correlation
##   <fct>    <fct>              <dbl>
## 1 XI IPS 1 Karya Ilmiah       0.806
## 2 XI IPS 2 Karya Ilmiah       0.805
## 3 XI MIPA  Karya Ilmiah       0.771
cor.plot.k.m <- 
ggplot(corp3ph.kelas.materi, 
       aes(x = materi, 
           y = correlation,
           text = kelas))+
  geom_col(aes(fill = kelas), 
           position = position_dodge()
           )+
    theme_minimal()+
  labs(title = "Korelasi Materi dan Nilai Harian",
       x = "Materi")+
    scale_fill_manual(
    values = fcols      
  )+
  theme(legend.position = "none",
        axis.title.y = element_blank())

#cor.plot.k.m
ggplotly(cor.plot.k.m, tooltip = "text")

Korelasi Penilaian Ranah Keterampilan dan Penilaian Harian

corp3ph.kelas.p3 <- data %>% 
  group_by(kelas, ppp) %>% 
  summarize(correlation = cor(nilai_ppp, nilai_ph))
corp3ph.kelas.p3
## # A tibble: 9 x 3
## # Groups:   kelas [3]
##   kelas    ppp        correlation
##   <fct>    <chr>            <dbl>
## 1 XI IPS 1 Portofolio       0.921
## 2 XI IPS 1 Praktek          0.732
## 3 XI IPS 1 Proyek           0.960
## 4 XI IPS 2 Portofolio       0.902
## 5 XI IPS 2 Praktek          0.731
## 6 XI IPS 2 Proyek           0.962
## 7 XI MIPA  Portofolio       0.937
## 8 XI MIPA  Praktek          0.621
## 9 XI MIPA  Proyek           0.949
corp3ph.kelas.p3 %>% slice_max(correlation)
## # A tibble: 3 x 3
## # Groups:   kelas [3]
##   kelas    ppp    correlation
##   <fct>    <chr>        <dbl>
## 1 XI IPS 1 Proyek       0.960
## 2 XI IPS 2 Proyek       0.962
## 3 XI MIPA  Proyek       0.949
corp3ph.kelas.p3 %>% slice_min(correlation)
## # A tibble: 3 x 3
## # Groups:   kelas [3]
##   kelas    ppp     correlation
##   <fct>    <chr>         <dbl>
## 1 XI IPS 1 Praktek       0.732
## 2 XI IPS 2 Praktek       0.731
## 3 XI MIPA  Praktek       0.621
cor.plot.k.p <- 
ggplot(corp3ph.kelas.p3, aes(ppp, correlation))+
    geom_col(aes(fill = kelas, text = kelas), 
           position = position_dodge()
           )+
    theme_minimal()+
  labs(title = "Korelasi Penilaian Ranah Keterampilan dan Penilaian Harian",
       subtitle = "Berdasarkan Proyek/Praktek/Portfolio",
       x = "Ranah Keterampilan")+
    scale_fill_manual(
    values = fcols      
  )+
  theme(legend.position = "none",
        axis.title.y = element_blank())

#cor.plot.k.p
ggplotly(cor.plot.k.p, tooltip = "text")

Perbandingan Nilai Semesteran Antar Kelas

plot.semester.k <-
ggplot(data, aes(x = nilai, text = exp))+
  geom_histogram(aes(fill = kelas),
                 binwidth = 1)+
  facet_wrap(~penilaian_s + kelas)+
  theme_minimal()+
  theme(legend.position = "none")+
  labs(title = "Perbandingan Sebaran Nilai Semesteran Antar Kelas",
       x = "Nilai",
       y = "Jumlah Siswa")+
  scale_fill_manual(
    values = fcols      
  )
#plot.semester.k
ggplotly(plot.semester.k, tooltip = "text")

Korelasi Nilai Harian dan Nilai Semesteran

k.kelas.ph.s <- 
  data %>% 
  group_by(kelas, materi) %>% 
  summarize(correlation = cor(nilai_ph, nilai))

k.kelas.ph.s
## # A tibble: 12 x 3
## # Groups:   kelas [3]
##    kelas    materi            correlation
##    <fct>    <fct>                   <dbl>
##  1 XI IPS 1 Drama                   0.557
##  2 XI IPS 1 Karya Ilmiah            0.396
##  3 XI IPS 1 Proposal Kegiatan       0.481
##  4 XI IPS 1 Resensi                 0.686
##  5 XI IPS 2 Drama                   0.285
##  6 XI IPS 2 Karya Ilmiah            0.182
##  7 XI IPS 2 Proposal Kegiatan       0.588
##  8 XI IPS 2 Resensi                 0.400
##  9 XI MIPA  Drama                   0.212
## 10 XI MIPA  Karya Ilmiah            0.338
## 11 XI MIPA  Proposal Kegiatan       0.330
## 12 XI MIPA  Resensi                 0.527
bcols <- c("Drama" = "#C6E0FF", 
           "Karya Ilmiah" = "#579A9E",
           "Proposal Kegiatan" = "#3292C3",
           "Resensi" = "#BCAB79")

plot.k.h.s <-
ggplot(k.kelas.ph.s, aes(x = kelas, 
                         y = correlation,
                         fill = materi,
                         text = materi))+
  geom_col(position = position_dodge())+
  scale_fill_manual(
    values = bcols
  )+
  theme_minimal()+
  theme(legend.position = "none",
        axis.title.x = element_blank(),
        axis.title.y = element_blank())+
  labs(title = "Korelasi Nilai Harian dan Semesteran",
       subtitle = "Berdasarkan Kelas dan Materi")
  

#plot.k.h.s
ggplotly(plot.k.h.s, tooltip = "text")

Korelasi Nilai Ranah Keterampilan dan Nilai Semesteran

k.kelas.ppp.s <- 
  data %>% 
  group_by(ppp, kelas) %>% 
  summarize(correlation = cor(nilai_ppp, nilai)) %>% 
  arrange(desc(correlation))

k.kelas.ppp.s
## # A tibble: 9 x 3
## # Groups:   ppp [3]
##   ppp        kelas    correlation
##   <chr>      <fct>          <dbl>
## 1 Proyek     XI IPS 1       0.512
## 2 Portofolio XI IPS 1       0.469
## 3 Praktek    XI IPS 1       0.363
## 4 Proyek     XI MIPA        0.344
## 5 Praktek    XI IPS 2       0.330
## 6 Portofolio XI MIPA        0.304
## 7 Proyek     XI IPS 2       0.294
## 8 Portofolio XI IPS 2       0.253
## 9 Praktek    XI MIPA        0.202
plot.ppp.k.s <-
ggplot(k.kelas.ppp.s, aes(x = ppp, 
                         y = correlation,
                         fill = kelas,
                         text = kelas))+
  geom_col(position = position_dodge())+
  scale_fill_manual(
    values = fcols
  )+
  theme_minimal()+
  theme(legend.position = "none",
        axis.title.x = element_blank(),
        axis.title.y = element_blank())+
  labs(title = "Korelasi Penilaian Ranah Keterampilan dan Nilai Semesteran",
       subtitle = "Berdasarkan Kelas")
  

#plot.ppp.k.s
ggplotly(plot.ppp.k.s, tooltip = "text")

Perbandingan Ranah Keterampilan Hingga Semesteran

data_edit_col <- 
  data %>% 
  mutate("Nilai Ranah Keterampilan" = nilai_ppp,
         "Nilai Harian" = nilai_ph,
         "Nilai Semesteran" = nilai,
         "Kelas" = kelas)

p.par.nilai <- 
ggparcoord(data_edit_col, 
           columns = c("Nilai Ranah Keterampilan", 
                       "Nilai Harian", 
                       "Nilai Semesteran"),
           groupColumn = "Kelas",
           scale = "uniminmax",
           order = c(11,12,13),
           showPoints = TRUE,
           splineFactor = 4,
           title = "Paralel Plot Nilai Bahasa Indonesia",
           alphaLines = 0.6)+ 
  scale_color_manual(values = fcols) +
  theme(
    axis.title.x = element_blank()
  )+
  theme_minimal()+
  labs(y = "Scaled with UniMinMax",
       col = "Kelas")
ggplotly(p.par.nilai, tooltip = "Kelas")

Perbandingan Korelasi

s.p3.h.s <-
data %>% 
  group_by(ppp, penilaian_h, penilaian_s) %>% 
  summarise(kor_ppp_ph = cor(nilai_ppp, nilai_ph),
            kor_ph_s = cor(nilai_ph, nilai))

s.p3.h.s
## # A tibble: 24 x 5
## # Groups:   ppp, penilaian_h [12]
##    ppp        penilaian_h penilaian_s kor_ppp_ph kor_ph_s
##    <chr>      <fct>       <fct>            <dbl>    <dbl>
##  1 Portofolio Harian 1    PAT              0.930    0.437
##  2 Portofolio Harian 1    PHBO             0.930    0.490
##  3 Portofolio Harian 2    PAT              1.00     0.257
##  4 Portofolio Harian 2    PHBO             1.00     0.377
##  5 Portofolio Harian 3    PAT              0.785    0.533
##  6 Portofolio Harian 3    PHBO             0.785    0.583
##  7 Portofolio Harian 4    PAT              1        0.364
##  8 Portofolio Harian 4    PHBO             1        0.381
##  9 Praktek    Harian 1    PAT              0.653    0.437
## 10 Praktek    Harian 1    PHBO             0.653    0.490
## # ... with 14 more rows
## # i Use `print(n = ...)` to see more rows

Impute missing values.

imputed.s.p3.h.s <-
  recipe(~ ., data = s.p3.h.s) %>%
  step_impute_bag(kor_ppp_ph) %>% 
  prep(s.p3.h.s)
imputed.s.p3.h.s 
## Recipe
## 
## Inputs:
## 
##       role #variables
##  predictor          5
## 
## Training data contained 24 data points and 2 incomplete rows. 
## 
## Operations:
## 
## Bagged tree imputation for kor_ppp_ph [trained]
df.imputed.s.p3.h.s <-
  bake(imputed.s.p3.h.s, new_data = s.p3.h.s) 
df.imputed.s.p3.h.s
## # A tibble: 24 x 5
## # Groups:   ppp, penilaian_h [12]
##    ppp        penilaian_h penilaian_s kor_ppp_ph kor_ph_s
##    <fct>      <fct>       <fct>            <dbl>    <dbl>
##  1 Portofolio Harian 1    PAT              0.930    0.437
##  2 Portofolio Harian 1    PHBO             0.930    0.490
##  3 Portofolio Harian 2    PAT              1.00     0.257
##  4 Portofolio Harian 2    PHBO             1.00     0.377
##  5 Portofolio Harian 3    PAT              0.785    0.533
##  6 Portofolio Harian 3    PHBO             0.785    0.583
##  7 Portofolio Harian 4    PAT              1        0.364
##  8 Portofolio Harian 4    PHBO             1        0.381
##  9 Praktek    Harian 1    PAT              0.653    0.437
## 10 Praktek    Harian 1    PHBO             0.653    0.490
## # ... with 14 more rows
## # i Use `print(n = ...)` to see more rows
anyNA(df.imputed.s.p3.h.s)
## [1] FALSE
pcols = c(
  "Portofolio" = "#BEEF9E",
  "Praktek" = "#3292C3",
  "Proyek" = "#BCAB79"
)
  
p.dens <-
ggplot(df.imputed.s.p3.h.s, aes(kor_ppp_ph, kor_ph_s)) +
  stat_density2d(geom="tile", aes(fill = ..density..), contour = FALSE) + 
  geom_point(size = 6, color = "white")+
  geom_point(aes(col=ppp, text = penilaian_s), size = 3)+
  labs(title = "Hubungan Nilai Korelasi",
       x = "Korelasi Nilai Ranah Keterampilan dan Nilai Harian",
       y = "Korelasi Nilai Harian dan Nilai Semesteran",
       color = "Keterampilan",
       fill = "Density"
       )+
  theme_minimal()+
  scale_color_manual(values = pcols)
  

#p.dens 
ggplotly(p.dens, tooltip = "text")

Uji Korelasi

uk.ppp.s <-
data %>% 
  select(kelas, nilai_ppp, nilai) %>% 
  nest(data = c(nilai_ppp, nilai)) %>% 
  mutate(
    test = map(data, ~ cor.test(.$nilai_ppp,
                               .$nilai)),
    tidied = map(test, tidy)
  ) %>% 
  unnest(cols = tidied) %>% 
  select(-data, -test)
uk.ppp.s
## # A tibble: 3 x 9
##   kelas    estimate statistic  p.value parameter conf.low conf.~1 method alter~2
##   <fct>       <dbl>     <dbl>    <dbl>     <int>    <dbl>   <dbl> <chr>  <chr>  
## 1 XI IPS 1    0.451     12.6  1.25e-32       622    0.386   0.512 Pears~ two.si~
## 2 XI MIPA     0.288      8.31 4.31e-16       766    0.221   0.351 Pears~ two.si~
## 3 XI IPS 2    0.289      7.23 1.58e-12       574    0.212   0.362 Pears~ two.si~
## # ... with abbreviated variable names 1: conf.high, 2: alternative
uk.ph.s <-
data %>% 
  select(kelas, nilai_ph, nilai) %>% 
  nest(data = c(nilai_ph, nilai)) %>% 
  mutate(
    test = map(data, ~ cor.test(.$nilai_ph,
                               .$nilai)),
    tidied = map(test, tidy)
  ) %>% 
  unnest(cols = tidied) %>% 
  select(-data, -test)
uk.ph.s
## # A tibble: 3 x 9
##   kelas    estimate statistic  p.value parameter conf.low conf.~1 method alter~2
##   <fct>       <dbl>     <dbl>    <dbl>     <int>    <dbl>   <dbl> <chr>  <chr>  
## 1 XI IPS 1    0.516     15.0  1.11e-43       622    0.456   0.571 Pears~ two.si~
## 2 XI MIPA     0.341     10.0  2.65e-22       766    0.276   0.402 Pears~ two.si~
## 3 XI IPS 2    0.332      8.44 2.54e-16       574    0.258   0.403 Pears~ two.si~
## # ... with abbreviated variable names 1: conf.high, 2: alternative

PCA

Preprocess

data_wide <-
  data %>%
  group_by(name_code) %>% 
  pivot_wider(
    names_from = ppp,
    values_from = nilai_ppp,
    values_fill = 0,
  ) %>% 
  pivot_wider(
    names_from = penilaian_h,
    values_from = nilai_ph,
    values_fill = 0
  ) %>% 
  pivot_wider(
    names_from = penilaian_s,
    values_from = nilai,
  ) %>% 
  summarise_all(mean) %>% 
  select(-c(kelas, materi, exp))
  
data_wide
## # A tibble: 82 x 10
##    name_code  Proyek Praktek Porto~1 Haria~2 Haria~3 Haria~4 Haria~5   PAT  PHBO
##    <chr>       <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <dbl> <dbl>
##  1 009c3fa52~   26      26      26      19.5    19.5    19.5    19.5    85    88
##  2 03bfff0bb~   26      26      26      19.5    19.5    19.5    19.5    85    78
##  3 0546753a8~   28.6    26.8    28.6    19.5    21.5    21.2    21.7    87    93
##  4 060bd7b14~   26.8    26.6    26.5    19.5    19.5    21      19.8    88    93
##  5 0b848e743~   26      26.6    26      19.5    19.5    19.5    20.1    84    78
##  6 0d9cce6d7~   30      27.3    29.8    21.2    23.2    21      21.7    91    93
##  7 0ec228726~   26      26      26      19.5    19.5    19.5    19.5    84    78
##  8 0f407cdc9~   26      26      26      19.5    19.5    19.5    19.5    83    78
##  9 13d13d8ba~   26      26      26      19.5    19.5    19.5    19.5    85    78
## 10 13d47c3b4~   26      26      26      19.5    19.5    19.5    19.5    86    87
## # ... with 72 more rows, and abbreviated variable names 1: Portofolio,
## #   2: `Harian 4`, 3: `Harian 2`, 4: `Harian 3`, 5: `Harian 1`
## # i Use `print(n = ...)` to see more rows

Use recipe to mark some column as ID, then normalise and process the rest.

process_rec <- # define recipe
  recipe(~., data = data_wide) %>% 
  update_role(name_code, new_role = "id") %>% 
  step_dummy(all_nominal_predictors()) %>% 
  step_normalize(all_predictors()) 

process_prep <- prep(process_rec) #evaluate recipe

Before applying pca, let’s look at the prepared data.

juice(process_prep)
## # A tibble: 82 x 10
##    name_code      Proyek Praktek Porto~1 Haria~2 Haria~3 Haria~4 Haria~5     PAT
##    <fct>           <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 009c3fa52d00~ -0.748  -0.887   -0.684  -0.639  -0.411  -0.715 -0.750   0.190 
##  2 03bfff0bb8d4~ -0.748  -0.887   -0.684  -0.639  -0.411  -0.715 -0.750   0.190 
##  3 0546753a8f53~  1.90    0.288    2.13   -0.639   1.76    1.92   2.16    0.766 
##  4 060bd7b14727~  0.0208  0.0271  -0.139  -0.639  -0.411   1.55  -0.304   1.05  
##  5 0b848e743bee~ -0.748   0.0271  -0.684  -0.639  -0.411  -0.715  0.0317 -0.0983
##  6 0d9cce6d7691~  3.35    1.20     3.40    1.55    3.57    1.55   2.16    1.92  
##  7 0ec228726269~ -0.748  -0.887   -0.684  -0.639  -0.411  -0.715 -0.750  -0.0983
##  8 0f407cdc9537~ -0.748  -0.887   -0.684  -0.639  -0.411  -0.715 -0.750  -0.386 
##  9 13d13d8ba02b~ -0.748  -0.887   -0.684  -0.639  -0.411  -0.715 -0.750   0.190 
## 10 13d47c3b44a7~ -0.748  -0.887   -0.684  -0.639  -0.411  -0.715 -0.750   0.478 
## # ... with 72 more rows, 1 more variable: PHBO <dbl>, and abbreviated variable
## #   names 1: Portofolio, 2: `Harian 4`, 3: `Harian 2`, 4: `Harian 3`,
## #   5: `Harian 1`
## # i Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names

Now let’s apply step_pca().

pca_rec <-
  process_rec %>% 
  step_pca(all_predictors())

pca_prep <- prep(pca_rec)
names(pca_prep)
##  [1] "var_info"       "term_info"      "steps"          "template"      
##  [5] "levels"         "retained"       "requirements"   "tr_info"       
##  [9] "orig_lvls"      "last_term_info"

Look at the pca results.

juice(pca_prep)
## # A tibble: 82 x 6
##    name_code                                 PC1     PC2     PC3     PC4     PC5
##    <fct>                                   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 009c3fa52d0061d1892df4db8b157787143c2~ -1.51  -0.0756  1.03   -0.367  -0.463 
##  2 03bfff0bb8d4586fb602da381c0b77c0d24b6~ -1.94   0.108   0.306   0.767  -0.207 
##  3 0546753a8f539848cea9f24c8f2a82caf6432~  4.02   1.93    0.919  -0.122   0.962 
##  4 060bd7b14727a7dd031ddd77cce6db37577c2~  0.725 -0.755   1.76   -0.487   0.554 
##  5 0b848e743bee93a24ec2478f05351d293f5f3~ -1.42   0.0102 -0.145   0.708   0.500 
##  6 0d9cce6d769144b8790139071d342aee4267f~  6.73   2.03   -0.0470  0.795  -1.21  
##  7 0ec2287262695dbaa3251e5df9c3aee0cee6c~ -2.01   0.185   0.120   0.589  -0.139 
##  8 0f407cdc953730852f9938ce8d325e71b8d23~ -2.09   0.261  -0.0662  0.411  -0.0704
##  9 13d13d8ba02bea54041a02584536224960dce~ -1.94   0.108   0.306   0.767  -0.207 
## 10 13d47c3b44a7b63db7b65d206fb1f0f0cd840~ -1.48  -0.134   1.14   -0.0753 -0.506 
## # ... with 72 more rows
## # i Use `print(n = ...)` to see more rows

PC Contributions to Variance Explained

Find standard deviation in each PC from steps.

sdev <- 
  pca_prep$steps[[3]]$res$sdev

Calculate percent of variation with standard deviation.

percent_variation <- sdev^2 / sum(sdev^2)
var_df <- data.frame(PC=paste0("PC",1:length(sdev)),
                     var_explained=percent_variation,
                     stringsAsFactors = FALSE)
var_df
##    PC var_explained
## 1 PC1  6.312045e-01
## 2 PC2  1.295237e-01
## 3 PC3  8.627531e-02
## 4 PC4  5.617098e-02
## 5 PC5  4.530960e-02
## 6 PC6  3.973402e-02
## 7 PC7  9.553585e-03
## 8 PC8  2.228303e-03
## 9 PC9  9.198070e-09

Plot.

var_df %>%
  mutate(PC = fct_inorder(PC),
         var_rounded = round(var_explained, digits = 2),
         var_cum = cumsum(var_rounded)) %>%
  ggplot(aes(x = PC,
             y= var_explained, 
             fill = PC))+
  geom_col()+ 
  scale_fill_manual(
    values = c(
      "#C6E0FF",
      "#579A9E",
      "#3292C3",
      "#BCAB79",
      "maroon",
      "white",
      "violet",
      "grey",
      "brown",
      "lightyellow"
    )
  )+
  geom_label(aes(label = var_cum))+
  labs(title = "PC Contributions to Variance Explained",
       x = NULL,
       y = "Var Explained")+
  theme(legend.position = "none")

It takes 4 PC to retain 91% variance explained.

tidied_pca <- tidy(pca_prep, 
                   number = 3) # the number of step in recipe, normalize is 2, pca is 3

Visualize Principal Components

tidied_pca %>%
  filter(component %in% paste0("PC", 1:6)) %>%
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(value, terms, fill = terms)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~component, nrow = 1) +
  labs(y = NULL)+
  scale_fill_manual(
    values = c(
      "maroon",
      "white",
      "violet",
      "grey",
      "brown",
      "lightyellow",
      "#C6E0FF",
      "#579A9E",
      "#3292C3",
      "#BCAB79"
    )
  )

Focus on the first 4 PC.

tidied_pca %>%
  filter(component %in% paste0("PC", 1:4)) %>%
  group_by(component) %>%
  top_n(8, abs(value)) %>%
  ungroup() %>%
  mutate(terms = reorder_within(terms, abs(value), component)) %>%
  ggplot(aes(abs(value), terms, fill = value > 0)) +
  geom_col() +
  facet_wrap(~component, scales = "free_y") +
  scale_y_reordered() +
  labs(
    x = "Absolute value of contribution",
    y = NULL, fill = "Positive?"
  )+
  scale_fill_manual(
    values = c("#BEEF9E", "#3292C3")
  )

The biggest divergence is at PC2, between Harian 2 and Harian 4.

# use juice() to transform int dataframe
juiced_pca <- juice(pca_prep)
juiced_pca 
## # A tibble: 82 x 6
##    name_code                                 PC1     PC2     PC3     PC4     PC5
##    <fct>                                   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 009c3fa52d0061d1892df4db8b157787143c2~ -1.51  -0.0756  1.03   -0.367  -0.463 
##  2 03bfff0bb8d4586fb602da381c0b77c0d24b6~ -1.94   0.108   0.306   0.767  -0.207 
##  3 0546753a8f539848cea9f24c8f2a82caf6432~  4.02   1.93    0.919  -0.122   0.962 
##  4 060bd7b14727a7dd031ddd77cce6db37577c2~  0.725 -0.755   1.76   -0.487   0.554 
##  5 0b848e743bee93a24ec2478f05351d293f5f3~ -1.42   0.0102 -0.145   0.708   0.500 
##  6 0d9cce6d769144b8790139071d342aee4267f~  6.73   2.03   -0.0470  0.795  -1.21  
##  7 0ec2287262695dbaa3251e5df9c3aee0cee6c~ -2.01   0.185   0.120   0.589  -0.139 
##  8 0f407cdc953730852f9938ce8d325e71b8d23~ -2.09   0.261  -0.0662  0.411  -0.0704
##  9 13d13d8ba02bea54041a02584536224960dce~ -1.94   0.108   0.306   0.767  -0.207 
## 10 13d47c3b44a7b63db7b65d206fb1f0f0cd840~ -1.48  -0.134   1.14   -0.0753 -0.506 
## # ... with 72 more rows
## # i Use `print(n = ...)` to see more rows
  ggplot(juiced_pca, aes(PC1, PC2)) +
  geom_point(alpha = 0.7, size = 2) +
  labs(color = NULL)+
  theme_minimal()

  ggplot(juiced_pca, aes(PC1, PC3)) +
  geom_point(alpha = 0.7, size = 2) +
  labs(color = NULL)+
  theme_minimal()

  ggplot(juiced_pca, aes(PC1, PC4)) +
  geom_point(alpha = 0.7, size = 2) +
  labs(color = NULL)+
  theme_minimal()

PCA with FactoMiner

pca_rec2 <- # define recipe
  recipe(~., data = data_wide) %>% 
  update_role(name_code, new_role = "id") %>% 
  step_normalize(all_predictors()) %>% 
  prep()
d4facto <- juice(pca_rec2)
d4facto
## # A tibble: 82 x 10
##    name_code      Proyek Praktek Porto~1 Haria~2 Haria~3 Haria~4 Haria~5     PAT
##    <fct>           <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 009c3fa52d00~ -0.748  -0.887   -0.684  -0.639  -0.411  -0.715 -0.750   0.190 
##  2 03bfff0bb8d4~ -0.748  -0.887   -0.684  -0.639  -0.411  -0.715 -0.750   0.190 
##  3 0546753a8f53~  1.90    0.288    2.13   -0.639   1.76    1.92   2.16    0.766 
##  4 060bd7b14727~  0.0208  0.0271  -0.139  -0.639  -0.411   1.55  -0.304   1.05  
##  5 0b848e743bee~ -0.748   0.0271  -0.684  -0.639  -0.411  -0.715  0.0317 -0.0983
##  6 0d9cce6d7691~  3.35    1.20     3.40    1.55    3.57    1.55   2.16    1.92  
##  7 0ec228726269~ -0.748  -0.887   -0.684  -0.639  -0.411  -0.715 -0.750  -0.0983
##  8 0f407cdc9537~ -0.748  -0.887   -0.684  -0.639  -0.411  -0.715 -0.750  -0.386 
##  9 13d13d8ba02b~ -0.748  -0.887   -0.684  -0.639  -0.411  -0.715 -0.750   0.190 
## 10 13d47c3b44a7~ -0.748  -0.887   -0.684  -0.639  -0.411  -0.715 -0.750   0.478 
## # ... with 72 more rows, 1 more variable: PHBO <dbl>, and abbreviated variable
## #   names 1: Portofolio, 2: `Harian 4`, 3: `Harian 2`, 4: `Harian 3`,
## #   5: `Harian 1`
## # i Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
# library(FactoMineR)

facto_pca <- PCA(
  X = d4facto,
  scale.unit = F, 
  quali.sup = c(1), 
  ncp = 6, 
  graph = F #disable visualization
  )
head(facto_pca$ind$coord)
##        Dim.1       Dim.2       Dim.3      Dim.4      Dim.5      Dim.6
## 1 -1.5085935  0.07555772  1.03099615 -0.3669759  0.4632670  0.1705318
## 2 -1.9374308 -0.10820413  0.30631482  0.7674604  0.2073614 -0.1603130
## 3  4.0190829 -1.93467835  0.91855086 -0.1216915 -0.9618886 -0.2528085
## 4  0.7250594  0.75528856  1.75767341 -0.4873821 -0.5536845 -0.9457349
## 5 -1.4153958 -0.01020121 -0.14503239  0.7081886 -0.5000483  0.3525309
## 6  6.7304121 -2.03072138 -0.04698073  0.7946486  1.2067587 -0.1802793
plot.PCA(
  x = facto_pca, 
  choix = "ind", # individual factor map# coloring
  select = "contrib 1", # 5 most contributing factor
  invisible = "quali"
)

d4facto[71,]
## # A tibble: 1 x 10
##   name_code   Proyek Praktek Porto~1 Haria~2 Haria~3 Haria~4 Haria~5   PAT  PHBO
##   <fct>        <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <dbl> <dbl>
## 1 9734ce6692~ -0.748  -0.887  -0.684  -0.639  -0.411  -0.715  -0.750 -1.83 -1.01
## # ... with abbreviated variable names 1: Portofolio, 2: `Harian 4`,
## #   3: `Harian 2`, 4: `Harian 3`, 5: `Harian 1`
thename <- data_wide[71,] %>% pull(name_code)
data_wide %>% 
  filter(name_code == thename)
## # A tibble: 1 x 10
##   name_code   Proyek Praktek Porto~1 Haria~2 Haria~3 Haria~4 Haria~5   PAT  PHBO
##   <chr>        <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <dbl> <dbl>
## 1 9734ce6692~     26      26      26    19.5    19.5    19.5    19.5    78    78
## # ... with abbreviated variable names 1: Portofolio, 2: `Harian 4`,
## #   3: `Harian 2`, 4: `Harian 3`, 5: `Harian 1`
plot.PCA(
  x = facto_pca, 
  choix = "var",
  autoLab = "yes"
)

plot.PCA(
  x = facto_pca, 
  choix = "var",
  select = "contrib 1"
)

The “Ranah Keterampilan” that’s most correlated with the semester exam score (PAT) is “Praktek” assignment. But, the variable that has the most contribution the the two dimension the graph was built on, is “Proyek”.

K-means Clustering

Reuse d4facto, but without the first 3 columns.

d4kmeans <- d4facto %>% 
  select(-1)
head(d4kmeans)
## # A tibble: 6 x 9
##    Proyek Praktek Portofolio `Harian 4` Harian ~1 Haria~2 Haria~3     PAT   PHBO
##     <dbl>   <dbl>      <dbl>      <dbl>     <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
## 1 -0.748  -0.887      -0.684     -0.639    -0.411  -0.715 -0.750   0.190   0.477
## 2 -0.748  -0.887      -0.684     -0.639    -0.411  -0.715 -0.750   0.190  -1.01 
## 3  1.90    0.288       2.13      -0.639     1.76    1.92   2.16    0.766   1.22 
## 4  0.0208  0.0271     -0.139     -0.639    -0.411   1.55  -0.304   1.05    1.22 
## 5 -0.748   0.0271     -0.684     -0.639    -0.411  -0.715  0.0317 -0.0983 -1.01 
## 6  3.35    1.20        3.40       1.55      3.57    1.55   2.16    1.92    1.22 
## # ... with abbreviated variable names 1: `Harian 2`, 2: `Harian 3`,
## #   3: `Harian 1`

Find K

We have to decide the numbers of K. Let’s simulate 9 Ks.

kmsim <- 
tibble(k = 1:9)

kmsim
## # A tibble: 9 x 1
##       k
##   <int>
## 1     1
## 2     2
## 3     3
## 4     4
## 5     5
## 6     6
## 7     7
## 8     8
## 9     9

Create kmeans clustering for each number of K.

set.seed(1)
kmsim <- kmsim %>% 
mutate(
    # cluster the data 9 times
    kclust = map(k, ~ kmeans(d4kmeans, .))
  )

kmsim 
## # A tibble: 9 x 2
##       k kclust  
##   <int> <list>  
## 1     1 <kmeans>
## 2     2 <kmeans>
## 3     3 <kmeans>
## 4     4 <kmeans>
## 5     5 <kmeans>
## 6     6 <kmeans>
## 7     7 <kmeans>
## 8     8 <kmeans>
## 9     9 <kmeans>

Glance

Apply tidy, glance, and augment to each kclust.

set.seed(1)
kmsim <- kmsim %>% 
mutate(
  glanced = map(kclust, glance)
)
kmsim
## # A tibble: 9 x 3
##       k kclust   glanced         
##   <int> <list>   <list>          
## 1     1 <kmeans> <tibble [1 x 4]>
## 2     2 <kmeans> <tibble [1 x 4]>
## 3     3 <kmeans> <tibble [1 x 4]>
## 4     4 <kmeans> <tibble [1 x 4]>
## 5     5 <kmeans> <tibble [1 x 4]>
## 6     6 <kmeans> <tibble [1 x 4]>
## 7     7 <kmeans> <tibble [1 x 4]>
## 8     8 <kmeans> <tibble [1 x 4]>
## 9     9 <kmeans> <tibble [1 x 4]>

Unnest glanced

clustering <-
  kmsim %>% 
  unnest(cols = glanced)
clustering %>% head()
## # A tibble: 6 x 6
##       k kclust   totss tot.withinss betweenss  iter
##   <int> <list>   <dbl>        <dbl>     <dbl> <int>
## 1     1 <kmeans>   729         729.  5.68e-13     1
## 2     2 <kmeans>   729         390.  3.39e+ 2     1
## 3     3 <kmeans>   729         288.  4.41e+ 2     2
## 4     4 <kmeans>   729         243.  4.86e+ 2     3
## 5     5 <kmeans>   729         204.  5.25e+ 2     2
## 6     6 <kmeans>   729         183.  5.46e+ 2     3
ggplot(clustering, aes(k, tot.withinss))+
  geom_line()+
  geom_point()

There are 3 centers before the tot.withinss decrease slowed down. That’s a good enough elbow. So our kmeans is:

thek <- kmsim %>% 
  filter(k == 3) %>% 
  pull(kclust)

Assign the cluster identification to a new column in d4kmeans

d4kmeans$cluster <- thek[[1]][[1]]

Profiling

student_score_profile <- 
d4kmeans %>% 
  group_by(cluster) %>% 
  summarise_all(mean)
student_score_profile
## # A tibble: 3 x 10
##   cluster Proyek Praktek Portofo~1 Haria~2 Haria~3 Haria~4 Haria~5    PAT   PHBO
##     <int>  <dbl>   <dbl>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>  <dbl>
## 1       1  1.88    0.745     1.98    0.271   2.21    1.07    1.56   0.646  0.935
## 2       2  0.315   0.946     0.184   1.02   -0.324   0.681   0.144  0.512  0.607
## 3       3 -0.677  -0.724    -0.630  -0.639  -0.411  -0.665  -0.496 -0.457 -0.587
## # ... with abbreviated variable names 1: Portofolio, 2: `Harian 4`,
## #   3: `Harian 2`, 4: `Harian 3`, 5: `Harian 1`
library(ggiraphExtra)
ggRadar(
  data=student_score_profile,
  mapping = aes(colours = cluster),
  ylim = 0.8
)+
  facet_wrap(~cluster)

d4kmeans %>% 
  head()
## # A tibble: 6 x 10
##    Proyek Praktek Porto~1 Haria~2 Haria~3 Haria~4 Haria~5     PAT   PHBO cluster
##     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>   <int>
## 1 -0.748  -0.887   -0.684  -0.639  -0.411  -0.715 -0.750   0.190   0.477       3
## 2 -0.748  -0.887   -0.684  -0.639  -0.411  -0.715 -0.750   0.190  -1.01        3
## 3  1.90    0.288    2.13   -0.639   1.76    1.92   2.16    0.766   1.22        1
## 4  0.0208  0.0271  -0.139  -0.639  -0.411   1.55  -0.304   1.05    1.22        2
## 5 -0.748   0.0271  -0.684  -0.639  -0.411  -0.715  0.0317 -0.0983 -1.01        3
## 6  3.35    1.20     3.40    1.55    3.57    1.55   2.16    1.92    1.22        1
## # ... with abbreviated variable names 1: Portofolio, 2: `Harian 4`,
## #   3: `Harian 2`, 4: `Harian 3`, 5: `Harian 1`

Viz

#library(factoextra)
set.seed(1)
x <- kmeans(d4kmeans, 3)

fviz_cluster(object = x, 
             data = d4kmeans)

Combine

km_x_pca <- PCA(X = d4kmeans, # this table has already added with the cluster
               quali.sup = 10, 
               graph=F) 

fviz_pca_biplot(km_x_pca,
                habillage = "cluster",
                geom.ind = "point", # menampilkan titik observasi saja
                addEllipses = T, # membuat elips disekitar cluster
                col.var = "navy"
)

Summary: